/*
 * Copyright (c) 2008-2015 Travis Geiselbrecht
 *
 * Use of this source code is governed by a MIT-style
 * license that can be found in the LICENSE file or at
 * https://opensource.org/licenses/MIT
 */
#include <lk/asm.h>
#include <arch/arm/cores.h>
#include <arch/arm/mmu.h>

#if WITH_KERNEL_VM
#include <kernel/vm.h>
#endif

/*
 * Image entry point and branch table, placed in the .text.boot section.
 *
 * Every slot is a single branch instruction and the slot order is
 * significant, so entries must not be reordered or removed. The first slot
 * is the reset path; the next seven dispatch to the arm_* handlers named in
 * each branch.
 */
.section ".text.boot"
.globl _start
_start:
    b   platform_reset
    b   arm_undefined
    b   arm_syscall
    b   arm_prefetch_abort
    b   arm_data_abort
    b   arm_reserved
    b   arm_irq
    b   arm_fiq
#if WITH_SMP
    /* extra slot on SMP builds: enters arm_reset directly, skipping any
     * platform override of the weak platform_reset symbol.
     * NOTE(review): presumably the entry used to start secondary cpus -- confirm.
     */
    b   arm_reset
#endif

/*
 * platform_reset / arm_reset: cpu reset entry.
 *
 * platform_reset is a weak symbol that simply falls through into arm_reset,
 * so a platform may supply its own platform_reset and branch to arm_reset
 * when it is done.
 *
 * Runs with no stack. Register use in this part of the boot path:
 *   r12 = scratch
 *   r4  = scratch (link-time address of .Lphys_offset)
 *   r11 = physical offset: run-time address minus link-time address.
 *         Later code adds r11 to a link-time address to get the address
 *         that is valid before the mmu is enabled.
 *   r0  = cpu id handed to arm_secondary_setup (SMP, non-boot cpus only)
 */
.weak platform_reset
platform_reset:
    /* Fall through for the weak symbol */

.globl arm_reset
arm_reset:
#if ARM_WITH_HYP
    /* if in HYP mode, move to SVC */
    /* the low 5 bits of cpsr hold the mode; 0x1a is HYP.
     * bleq sets lr, so arm32_hyp_to_svc resumes at the instruction after it. */
    mrs r12, cpsr
    and r12, r12, #0x1f
    cmp r12, #0x1a
    bleq arm32_hyp_to_svc
#endif // ARM_WITH_HYP

    /* do some early cpu setup */
    /* read SCTLR, adjust the cache/mmu enable bits, write it back */
    mrc     p15, 0, r12, c1, c0, 0
    /* i/d cache disable, mmu disabled */
    bic     r12, #(1<<12)
    bic     r12, #(1<<2 | 1<<0)
#if WITH_KERNEL_VM
    /* enable caches so atomics and spinlocks work */
    /* net effect with WITH_KERNEL_VM: bits 12 and 2 set, bit 0 (mmu) clear */
    orr     r12, r12, #(1<<12)
    orr     r12, r12, #(1<<2)
#endif // WITH_KERNEL_VM
    mcr     p15, 0, r12, c1, c0, 0

    /* calculate the physical offset from our eventual virtual location */
    /* r4 = address of the label as linked (loaded from the literal pool),
     * adr = address of the label as currently executing (pc-relative),
     * r11 = the difference between the two */
.Lphys_offset:
    ldr     r4, =.Lphys_offset
    adr     r11, .Lphys_offset
    sub     r11, r11, r4

#if WITH_SMP
    /* figure out our cpu number */
    mrc     p15, 0, r12, c0, c0, 5 /* read MPIDR */

    /* mask off the bottom bits to test cluster number:cpu number */
    ubfx    r12, r12, #0, #SMP_CPU_ID_BITS

    /* if we're not cpu 0:0, fall into a trap and wait */
    /* non-boot cpus leave this path for good: r0 carries the masked id and
     * r11 the physical offset into arm_secondary_setup */
    teq     r12, #0
    movne   r0, r12
    bne     arm_secondary_setup
#endif // WITH_SMP

#if WITH_CPU_EARLY_INIT
    /* call platform/arch/etc specific init code */
    /* called with no stack set up yet; lr is overwritten by the bl.
     * NOTE(review): the callee presumably must preserve r11 -- confirm. */
    bl      __cpu_early_init
#endif // WITH_CPU_EARLY_INIT

/*
 * Optional physical self-relocation.
 *
 * If the image is not executing from MEMBASE + KERNEL_LOAD_OFFSET, copy it
 * there word by word and continue executing from the copy.
 *
 *   r4  = current (loaded) address of _start, advances as the copy source
 *   r5  = intended physical address of _start, advances as the copy dest
 *   r12 = delta: loaded address minus intended address
 *   r6  = copy end on the source side
 *   r7  = scratch
 *   r11 = physical offset, corrected by the delta after the move
 */
#if WITH_NO_PHYS_RELOCATION
    /* assume that image is properly loaded in physical memory */
#else
    /* see if we need to relocate to our proper location in physical memory */
    adr     r4, _start                           /* this emits sub r4, pc, #constant */
    ldr     r5, =(MEMBASE + KERNEL_LOAD_OFFSET)  /* calculate the binary's physical load address */
    subs    r12, r4, r5                          /* calculate the delta between where we're loaded and the proper spot */
    beq     .Lrelocate_done

    /* we need to relocate ourselves to the proper spot */
    /* r6 = __data_end converted from its link-time address to the intended
     * physical address (subtract KERNEL_BASE - MEMBASE), then shifted by the
     * delta to where that byte currently sits in the loaded image */
    ldr     r6, =__data_end
    ldr     r7, =(KERNEL_BASE - MEMBASE)
    sub     r6, r7
    add     r6, r12

    /* copy one word per iteration, ascending, from r4 to r5.
     * NOTE(review): the loop stops only on exact equality of r4 and r6, so it
     * assumes the copied span is a whole number of words -- confirm
     * __data_end is word aligned in the linker script.
     * NOTE(review): an ascending copy assumes the destination does not
     * overlap the not-yet-copied part of the source -- confirm for the
     * supported load addresses. */
.Lrelocate_loop:
    ldr     r7, [r4], #4
    str     r7, [r5], #4
    cmp     r4, r6
    bne     .Lrelocate_loop

    /* we're relocated, jump to the right address */
    /* pc reads as this instruction's address + 8, so subtracting the delta
     * lands on the instruction after the nop, inside the relocated copy */
    sub     pc, r12
    nop     /* skipped by the sub from pc */

    /* recalculate the physical offset */
    sub     r11, r11, r12

.Lrelocate_done:
#endif // !WITH_NO_PHYS_RELOCATION

/*
 * Build the initial first-level translation table and turn on the mmu.
 *
 * Still running at physical addresses: every link-time symbol address is
 * converted by adding the physical offset in r11.
 *
 *   r4  = physical address of the table handed to .Lmmu_setup
 *         (arm_kernel_translation_table, or tt_trampoline when
 *         MMU_WITH_TRAMPOLINE is set)
 *   r5  = cursor into mmu_initial_mappings
 *   r6-r10, r12 = scratch
 *   r8  = arm_kernel_translation_table | MMU_TTBRx_FLAGS (trampoline only)
 *
 * NOTE(review): this section is guarded by ARCH_HAS_MMU, but .Lmmu_setup,
 * which it calls at the end, is only assembled under WITH_KERNEL_VM --
 * confirm the two are always defined together.
 */
#if ARCH_HAS_MMU
.Lsetup_mmu:

    /* set up the mmu according to mmu_initial_mappings */

    /* load the base of the translation table and clear the table */
    ldr     r4, =arm_kernel_translation_table
    add     r4, r4, r11
        /* r4 = physical address of translation table */

    mov     r5, #0
    mov     r6, #0

    /* walk through all the entries in the translation table, setting them up */
    /* writes zero to 4096 word-sized entries; r6 is the entry index */
0:
    str     r5, [r4, r6, lsl #2]
    add     r6, #1
    cmp     r6, #4096
    bne     0b

    /* load the address of the mmu_initial_mappings table and start processing */
    ldr     r5, =mmu_initial_mappings
    add     r5, r5, r11
        /* r5 = physical address of mmu initial mapping table */

    /* each table entry is five words; r5 advances past the entry on load */
.Linitial_mapping_loop:
    ldmia   r5!, { r6-r10 }
        /* r6 = phys, r7 = virt, r8 = size, r9 = flags, r10 = name */

    /* round size up to 1MB alignment */
    /* r10 (the name word, not needed here) is reused for the offset of phys
     * within its 1MB section, so a mapping that starts mid-section still
     * covers its last byte */
    ubfx        r10, r6, #0, #20
    add     r8, r8, r10
    add     r8, r8, #(1 << 20)
    sub     r8, r8, #1

    /* mask all the addresses and sizes to 1MB boundaries */
    lsr     r6, #20  /* r6 = physical address / 1MB */
    lsr     r7, #20  /* r7 = virtual address / 1MB */
    lsr     r8, #20  /* r8 = size in 1MB chunks */

    /* if size == 0, end of list */
    cmp     r8, #0
    beq     .Linitial_mapping_done

    /* set up the flags */
    /* default to MMU_KERNEL_L1_PTE_FLAGS; the flags word is compared for
     * exact equality (not bit-tested) against the UNCACHED and DEVICE values */
    ldr     r10, =MMU_KERNEL_L1_PTE_FLAGS
    teq     r9, #MMU_INITIAL_MAPPING_FLAG_UNCACHED
    ldreq   r10, =MMU_INITIAL_MAP_STRONGLY_ORDERED
    beq     0f
    teq     r9, #MMU_INITIAL_MAPPING_FLAG_DEVICE
    ldreq   r10, =MMU_INITIAL_MAP_DEVICE
        /* r10 = mmu entry flags */

    /* one iteration per 1MB section of the current mapping */
0:
    orr     r12, r10, r6, lsl #20
        /* r12 = phys addr | flags */

    /* store into appropriate translation table entry */
    str     r12, [r4, r7, lsl #2]

    /* loop until we're done */
    add     r6, #1
    add     r7, #1
    subs    r8, #1
    bne     0b

    b       .Linitial_mapping_loop

.Linitial_mapping_done:

#if MMU_WITH_TRAMPOLINE
    /* move arm_kernel_translation_table address to r8 and
     * set cacheable attributes on translation walk
     */
    orr     r8, r4, #MMU_TTBRx_FLAGS

    /* Prepare tt_trampoline page table */
    /* Calculate pagetable physical addresses */
    ldr     r4, =tt_trampoline  /* r4 = tt_trampoline vaddr */
    add     r4, r4, r11     /* r4 = tt_trampoline paddr */

    /* Zero tt_trampoline translation tables */
    /* r6 = entry index, r7 = zero; 0x1000 word-sized entries */
    mov     r6, #0
    mov     r7, #0
1:
    str     r7, [r4, r6, lsl#2]
    add     r6, #1
    cmp     r6, #0x1000
    blt     1b

    /* Setup 1M section mapping at
     * phys  -> phys   and
     * virt  -> phys
     */
    /* the section mapped is the one this code is currently executing from,
     * taken from the top 12 bits of pc */
    lsr     r6, pc, #20     /* r6 = paddr index */
    ldr     r7, =MMU_KERNEL_L1_PTE_FLAGS
    add     r7, r7, r6, lsl #20 /* r7 = pt entry */

    str     r7, [r4, r6, lsl #2]    /* tt_trampoline[paddr index] = pt entry */

    /* vaddr = section base paddr - physical offset; shifting right by
     * (20 - 2) turns it directly into a byte offset into the table */
    rsb     r6, r11, r6, lsl #20    /* r6 = vaddr */
    str     r7, [r4, r6, lsr #(20 - 2)] /* tt_trampoline[vaddr index] = pt entry */
#endif // MMU_WITH_TRAMPOLINE

    /* set up the mmu */
    /* .Lmmu_setup returns to the virtual alias of the next instruction */
    bl      .Lmmu_setup
#endif // ARCH_HAS_MMU

    /* at this point we're running at our final location in virtual memory (if enabled) */
    /*
     * Boot cpu stack setup. Steps through each processor mode with cpsid
     * (which also keeps irqs masked) and writes that mode's banked sp.
     * Every mode except supervisor gets sp = 0; supervisor, the mode the
     * kernel stays in, gets sp = abort_stack + ARCH_DEFAULT_STACK_SIZE.
     * r12 is the only scratch register used.
     */
.Lstack_setup:
    /* set up the stack for irq, fiq, abort, undefined, system/user, and lastly supervisor mode */
    mov     r12, #0

    cpsid   i,#0x12       /* irq */
    mov     sp, r12

    cpsid   i,#0x11       /* fiq */
    mov     sp, r12

    cpsid   i,#0x17       /* abort */
    mov     sp, r12

    cpsid   i,#0x1b       /* undefined */
    mov     sp, r12

    cpsid   i,#0x1f       /* system */
    mov     sp, r12

    /* supervisor is switched to last so that it is the mode left active */
    cpsid   i,#0x13       /* supervisor */
    ldr     r12, =abort_stack
    add     r12, #ARCH_DEFAULT_STACK_SIZE
    mov     sp, r12

    /* stay in supervisor mode from now on out */

    /*
     * Copy the initialized data segment out of rom if necessary.
     *   r4 = load (rom) address of .data, advances as the copy source
     *   r5 = run address of .data, advances as the copy destination
     *   r6 = end of .data at its run address
     *   r7 = scratch word
     * If the load and run addresses are identical there is nothing to copy.
     *
     * The bounds checks here and in the bss loop use the unsigned condition
     * (lo). These are addresses: a signed compare gives the wrong answer,
     * and silently skips the region, when it straddles 0x80000000.
     */
    ldr     r4, =__data_start_rom
    ldr     r5, =__data_start
    ldr     r6, =__data_end

    cmp     r4, r5
    beq     .L__do_bss

.L__copy_loop:
    cmp     r5, r6
    ldrlo   r7, [r4], #4
    strlo   r7, [r5], #4
    blo     .L__copy_loop

.L__do_bss:
    /* clear out the bss: r4 = cursor, r5 = end (_end), r6 = zero */
    ldr     r4, =__bss_start
    ldr     r5, =_end
    mov     r6, #0
.L__bss_loop:
    cmp     r4, r5
    strlo   r6, [r4], #4
    blo     .L__bss_loop

    /* hand off to the C entry point; spin in place if it ever returns */
    bl      lk_main
    b       .

#if WITH_KERNEL_VM
    /* per cpu mmu setup, shared between primary and secondary cpus
       args:
       r4 == translation table physical
       r8 == final translation table physical (if using trampoline)

       also reads:
       r11 == physical offset, used to convert lr before returning
       lr  == return address, expected to be a physical address

       clobbers r12. Called with bl while running at physical addresses;
       returns to the virtual alias of the caller with the mmu enabled.
    */
.Lmmu_setup:
    /* Invalidate TLB. The value in r0 is ignored */
    mcr     p15, 0, r0, c8, c7, 0
    dsb     sy
    isb

    /* Write 0 to TTBCR */
    mov     r12, #0
    mcr     p15, 0, r12, c2, c0, 2
    isb

    /* Set cacheable attributes on translation walk */
    orr     r12, r4, #MMU_TTBRx_FLAGS

    /* Write ttbr with phys addr of the translation table */
    mcr     p15, 0, r12, c2, c0, 0  // TTBR0
    isb

    /* Write DACR */
    mov     r12, #0x1
    mcr     p15, 0, r12, c3, c0, 0
    isb

    /* Read SCTLR into r12 */
    mrc     p15, 0, r12, c1, c0, 0

    /* Disable TRE/AFE */
    bic     r12, #(1<<29 | 1<<28)

    /* Turn on the MMU */
    orr     r12, #0x1

    /* Write back SCTLR */
    mcr     p15, 0, r12, c1, c0, 0
    isb

    /* Jump to virtual code address */
    /* the literal holds the link-time address of the label below, so loading
     * it into pc moves execution from the physical to the virtual alias */
    ldr     pc, =1f
1:

#if MMU_WITH_TRAMPOLINE
    /* Switch to main page table */
    /* r8 already has MMU_TTBRx_FLAGS or'ed in by the caller */
    mcr     p15, 0, r8, c2, c0, 0
    isb
#endif // MMU_WITH_TRAMPOLINE

    /* Invalidate TLB. The value in r0 is ignored */
    mcr     p15, 0, r0, c8, c7, 0
    dsb     sy
    isb

    /* assume lr was in physical memory, adjust it before returning */
    /* r11 is physical minus virtual, so subtracting it yields the virtual
     * address of the caller's next instruction */
    sub     lr, r11
    bx      lr
#endif // WITH_KERNEL_VM

#if WITH_SMP
    /* secondary cpu entry point */
    /* r0 holds cpu number */
    /* r11 hold phys offset */
    /*
     * Entered from arm_reset on every cpu whose masked MPIDR id is nonzero.
     * Waits for arm_boot_cpu_lock to read zero, converts the id in r0 to a
     * linear cpu number, sets up this cpu's mode stacks, enables the mmu
     * (WITH_KERNEL_VM) and calls arm_secondary_entry(cpu number).
     * Cpus that fail the range checks spin forever in unsupported_cpu_trap.
     *
     *   r5      = linear cpu number, preserved across the stack and mmu setup
     *   r1, r2, r12 = scratch
     *   r4, r8  = arguments to .Lmmu_setup
     */
FUNCTION(arm_secondary_setup)
    /* all other cpus, trap and wait to be released */
    /* the mmu is still off, so the lock is read through its physical
     * address (link-time address + r11) */
1:
    wfe
    ldr     r12, =arm_boot_cpu_lock
    add     r12, r12, r11
    ldr     r12, [r12]
    cmp     r12, #0
    bne     1b

    /* r1 = low 8 bits of the id (cpu within its cluster); it must fit in
     * SMP_CPU_CLUSTER_SHIFT bits. The remaining upper bits are shifted down
     * to sit directly above it, giving a dense cpu number in r0. */
    and     r1, r0, #0xff
    cmp     r1, #(1 << SMP_CPU_CLUSTER_SHIFT)
    bge     unsupported_cpu_trap
    bic     r0, r0, #0xff
    orr     r0, r1, r0, LSR #(8 - SMP_CPU_CLUSTER_SHIFT)

    cmp     r0, #SMP_MAX_CPUS
    bge     unsupported_cpu_trap
    mov     r5, r0 /* save cpu num */

    /* set up the stack for irq, fiq, abort, undefined, system/user, and lastly supervisor mode */
    mov     r1, #0
    cpsid   i,#0x12       /* irq */
    mov     sp, r1

    cpsid   i,#0x11       /* fiq */
    mov     sp, r1

    cpsid   i,#0x17       /* abort */
    mov     sp, r1

    cpsid   i,#0x1b       /* undefined */
    mov     sp, r1

    cpsid   i,#0x1f       /* system */
    mov     sp, r1

    /* supervisor sp = abort_stack + ARCH_DEFAULT_STACK_SIZE * (cpu num + 1).
     * r0 is modified here, which is why the cpu number was saved in r5. */
    cpsid   i,#0x13       /* supervisor */
    ldr     r1, =abort_stack
    mov     r2, #ARCH_DEFAULT_STACK_SIZE
    add     r0, #1
    mul     r2, r2, r0
    add     r1, r2

    mov     sp, r1

#if WITH_KERNEL_VM
    /* load the physical base of the translation table; the table contents
     * are not modified on this path */
    ldr     r4, =arm_kernel_translation_table
    add     r4, r4, r11

#if MMU_WITH_TRAMPOLINE
    /* move arm_kernel_translation_table address to r8 and
     * set cacheable attributes on translation walk
     */
    orr     r8, r4, #MMU_TTBRx_FLAGS

    /* Prepare tt_trampoline page table */
    /* Calculate pagetable physical addresses */
    /* only the address is computed here; tt_trampoline is used as found */
    ldr     r4, =tt_trampoline  /* r4 = tt_trampoline vaddr */
    add     r4, r4, r11     /* r4 = tt_trampoline paddr */
#endif // MMU_WITH_TRAMPOLINE

    /* set up the mmu on this cpu and switch to virtual memory */
    bl      .Lmmu_setup
#endif // WITH_KERNEL_VM

    /* stay in supervisor and call into arm arch code to continue setup */
    /* if arm_secondary_entry ever returns, execution falls through into the
     * trap loop below */
    mov     r0, r5
    bl      arm_secondary_entry

    /* cpus above the number we claim to support get trapped here */
unsupported_cpu_trap:
    wfe
    b       unsupported_cpu_trap
#endif // WITH_SMP

#if ARM_WITH_HYP
/*
 * arm32_hyp_to_svc: leave HYP mode and resume the caller in SVC mode.
 *
 * Builds the target status word from the current cpsr with only the mode
 * field replaced, stores it in SPSR_hyp, stores the caller's return address
 * (lr) in ELR_hyp, and executes eret. Execution continues at the
 * instruction after the caller's bl, now in SVC mode.
 *
 * in:       lr  = return address
 * clobbers: r12
 */
arm32_hyp_to_svc:
    mrs r12, cpsr
    bic r12, #0x1f      // clear mode bits
    orr r12, r12, #0x13 // set mode bits to SVC
    msr SPSR_hyp, r12
    msr ELR_hyp, lr
    eret                // "restore" the mode, and return
#endif // ARM_WITH_HYP

/* emit the literal pool for the ldr rX, =value loads in the code above */
.ltorg

#if WITH_KERNEL_VM && MMU_WITH_TRAMPOLINE
/*
 * tt_trampoline: storage for the temporary translation table used while the
 * mmu is being switched on. 16384 bytes matches the 0x1000 word-sized
 * entries the boot code zeroes. .align 14 requests 2^14 (16KB) alignment,
 * since the ARM assembler treats the .align operand as a power of two.
 */
.section ".bss.prebss.translation_table"
.align 14
DATA(tt_trampoline)
    .skip 16384
#endif // WITH_KERNEL_VM && MMU_WITH_TRAMPOLINE

/* leave the assembler in the .data section, word aligned */
.data
.align 2
